import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
from sklearn import tree
from sklearn import ensemble
import itertools
from sklearn import metrics
from sklearn import model_selection
import pvlib
import cs_detection
import utils
import visualize_plotly as visualize
sns.set_style("white")
matplotlib.rcParams['figure.figsize'] = (20., 8.)
from IPython.display import Image
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import cufflinks as cf
cf.go_offline()
init_notebook_mode(connected=True)
%load_ext autoreload
%autoreload 2
np.set_printoptions(precision=4)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
matplotlib.rcParams.update({'font.size': 16})
import warnings
warnings.filterwarnings(action='ignore')
plt.close('all')
# nsrdb = pd.read_pickle('abq_nsrdb_1.pkl.gz')
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
detect_obj.scale_model()
detect_obj.calc_all_metrics()
nsrdb = detect_obj.df
# filter off night times, will skew data
nsrdb_nonight = nsrdb[nsrdb['Clearsky GHI pvlib'] > 0]
clear_pds = nsrdb_nonight[nsrdb_nonight['sky_status']]
cloudy_pds = nsrdb_nonight[~nsrdb_nonight['sky_status']]
print(np.mean(clear_pds['GHI/GHIcs mean']), np.std(clear_pds['GHI/GHIcs mean']), np.median(clear_pds['GHI/GHIcs mean']))
print(np.mean(cloudy_pds['GHI/GHIcs mean']), np.std(cloudy_pds['GHI/GHIcs mean']), np.median(cloudy_pds['GHI/GHIcs mean']))
fig, ax = plt.subplots()
sns.boxplot(data=[cloudy_pds['GHI/GHIcs mean'], clear_pds['GHI/GHIcs mean']], ax=ax)
_ = ax.set_ylabel('GHI / GHI$_\mathrm{CS}$ mean')
_ = ax.set_xticklabels(['cloudy', 'clear'])
_ = ax.set_xlabel('NSRDB labels')
cloudy_pds['GHI/GHIcs mean'].describe()
clear_pds['GHI/GHIcs mean'].describe()
Clear periods have relatively tight distribution near a ratio of 1. Some obvious low outliers need to be fixed/removed. Cloudy periods show much more spread which is expected. The mean and median of cloudy periods is quite a bit lower than clear periods (mean and median of clear ~.95, cloudy mean and median is .66 and .74). Based on the whiskers there might be mislabeled cloudy points, though these points could also have ratios near 1 but be in noisy periods.
fig, ax = plt.subplots()
bins = np.linspace(0, 1.2, 25)
ax.hist(clear_pds['GHI/GHIcs mean'], bins, label='clear', color='C1', alpha=.75)
ax.hist(cloudy_pds['GHI/GHIcs mean'], bins, label='cloudy', color='C0', alpha=.75)
ax.set_xlabel('GHI / GHI$_\mathrm{CS}$ mean')
_ = ax.legend()
_ = ax.set_ylabel('Frequency')
Another view of the same data. Clear periods have ratios very localized just under 1 while cloudy periods are spread out.
clear_list, cloudy_list, year_list = [], [], []
for year, grp in nsrdb_nonight.groupby([nsrdb_nonight.index.year]):
clear_pds = grp[grp['sky_status']]
cloudy_pds = grp[~grp['sky_status']]
clear_list.append(clear_pds['GHI/GHIcs mean'])
cloudy_list.append(cloudy_pds['GHI/GHIcs mean'])
year_list.append(year)
fig, ax = plt.subplots(ncols=1, nrows=2, sharex=True, sharey=True, figsize=(16, 8))
sns.boxplot(data=clear_list, ax=ax[0], color='C1')
_ = ax[0].set_title('Clear')
_ = ax[0].set_ylim(0, 1.2)
sns.boxplot(data=cloudy_list, ax=ax[1], color='C0')
_ = ax[1].set_xticklabels(year_list, rotation=45)
_ = ax[1].set_title('Cloudy')
_ = ax[1].set_ylim(0, 1.2)
ax[0].set_ylabel('GHI / GHI$_\mathrm{CS}$')
ax[1].set_ylabel('GHI / GHI$_\mathrm{CS}$')
_ = fig.tight_layout()
Visually it appears that each year behaves similarly for both cloudy and clear data.
clear_list, cloudy_list, year_list = [], [], []
for year, grp in nsrdb_nonight.groupby([nsrdb_nonight.index.year, nsrdb_nonight.index.month // 3]):
clear_pds = grp[grp['sky_status']]
cloudy_pds = grp[~grp['sky_status']]
clear_list.append(clear_pds['GHI/GHIcs mean'])
cloudy_list.append(cloudy_pds['GHI/GHIcs mean'])
year_list.append(year)
fig, ax = plt.subplots(ncols=1, nrows=2, sharex=True, figsize=(16, 8))
sns.boxplot(data=clear_list, ax=ax[0], color='C1')
_ = ax[0].set_title('Clear')
_ = ax[0].set_ylim(0, 1.2)
sns.boxplot(data=cloudy_list, ax=ax[1], color='C0')
_ = ax[1].set_xticklabels(year_list, rotation=90)
_ = ax[1].set_title('Cloudy')
_ = ax[1].set_ylim(0, 1.2)
ax[0].set_ylabel('GHI / GHI$_\mathrm{CS}$')
ax[1].set_ylabel('GHI / GHI$_\mathrm{CS}$')
_ = fig.tight_layout()
A finer view of the year-on-year behavior doesn't show any striking seasonal trends. During cloudy periods, we do notice that as time goes on, there are less 'outlier' points on the low end of the ratios.
clear_list, cloudy_list, tfn_list = [], [], []
for tfn, grp in nsrdb_nonight.groupby([nsrdb_nonight['abs(t-tnoon)']]):
clear_pds = grp[grp['sky_status']]
cloudy_pds = grp[~grp['sky_status']]
clear_list.append(clear_pds['GHI/GHIcs mean'])
cloudy_list.append(cloudy_pds['GHI/GHIcs mean'])
tfn_list.append(tfn)
fig, ax = plt.subplots(ncols=1, nrows=2, sharex=True, figsize=(16, 8))
sns.boxplot(data=clear_list, ax=ax[0], color='C1')
_ = ax[0].set_title('Clear')
_ = ax[0].set_ylim(0, 1.2)
sns.boxplot(data=cloudy_list, ax=ax[1], color='C0')
_ = ax[1].set_xticklabels(tfn_list, rotation=45)
_ = ax[1].set_title('Cloudy')
_ = ax[1].set_ylim(0, 1.2)
_ = ax[1].set_xlabel('Minutes from noon')
_ = ax[0].set_ylabel('GHI / GHI$_{\mathrm{CS}}$')
_ = ax[1].set_ylabel('GHI / GHI$_{\mathrm{CS}}$')
_ = fig.tight_layout()
As we move away from solar noon (defined as the point of maximum model irradiance), the clear labeled periods include more and more outliers with respect to the GHI/GHIcs ratio.
sample = nsrdb[nsrdb.index >= '01-01-2012']
fig, ax = plt.subplots(figsize=(16, 8))
_ = ax.plot(sample.index, sample['GHI'], label='GHI')
_ = ax.plot(sample.index, sample['Clearsky GHI pvlib'], label='GHIcs', alpha=.5)
a = ax.scatter(sample[sample['sky_status']].index, sample[sample['sky_status']]['GHI'],
c=sample[sample['sky_status']]['GHI/GHIcs mean'], label=None, zorder=10, cmap='seismic', s=10)
_ = ax.legend(loc='upper right')
_ = ax.set_title('NSRDB Clear periods')
_ = ax.set_xlabel('Date')
_ = ax.set_ylabel('GHI / Wm$^{-2}$')
cb = fig.colorbar(a)
cb.set_label('GHI / GHI$_{\mathrm{CS}}$')
For points labeled as clear by NSRDB, the early morning periods have a noticably lower ratio. This is visual confirmation of the previous plot.
fig, ax = plt.subplots(figsize=(16, 8))
_ = ax.plot(sample.index, sample['GHI'], label='GHI')
_ = ax.plot(sample.index, sample['Clearsky GHI pvlib'], label='GHIcs', alpha=0.5)
a = ax.scatter(sample[~sample['sky_status'] & sample['Clearsky GHI pvlib'] > 0].index,
sample[~sample['sky_status'] & sample['Clearsky GHI pvlib'] > 0]['GHI'],
c=sample[~sample['sky_status'] & sample['Clearsky GHI pvlib'] > 0]['GHI/GHIcs mean'], zorder=10, cmap='seismic', s=10, label=None)
_ = ax.legend(loc='upper right')
_ = ax.set_title('NSRDB cloudy periods')
_ = ax.set_xlabel('Date')
_ = ax.set_ylabel('GHI / Wm$^{-2}$')
cb = fig.colorbar(a)
cb.set_label('GHI / GHI$_{\mathrm{CS}}$')
fig, ax = plt.subplots(figsize=(16, 8))
subsample = sample[(sample.index >= '03-22-2012') & (sample.index < '03-26-2012')]
_ = ax.plot(subsample.index, subsample['GHI'], label='GHI')
_ = ax.plot(subsample.index, subsample['Clearsky GHI pvlib'], label='GHIcs', alpha=1)
a = ax.scatter(subsample[(~subsample['sky_status']) & (subsample['Clearsky GHI pvlib'] > 0)].index,
subsample[(~subsample['sky_status']) & (subsample['Clearsky GHI pvlib'] > 0)]['GHI'],
c=subsample[(~subsample['sky_status']) & (subsample['Clearsky GHI pvlib'] > 0)]['GHI/GHIcs mean'], zorder=10, cmap='seismic', s=40, label=None)
_ = ax.legend(bbox_to_anchor=(.85, .85))
_ = ax.set_title('NSRDB cloudy periods')
_ = ax.set_xlabel('Date')
_ = ax.set_ylabel('GHI / Wm$^{-2}$')
cb = fig.colorbar(a)
cb.set_label('GHI / GHI$_{\mathrm{CS}}$')
There appear to be many clear points that are labeled as cloudy. There are many cases where there is a ratio near 1 during a 'noisy' period, which should not be labeled clear. We can't assume ratio is good enough. This seems to be a much bigger problem than clear points having a low ratio.
num_clear_good_ratio = len(nsrdb[(nsrdb['sky_status']) & (nsrdb['GHI/GHIcs mean'] >= .95) & (nsrdb['Clearsky GHI pvlib'] > 0)])
num_clear_bad_ratio = len(nsrdb[(nsrdb['sky_status']) & (nsrdb['GHI/GHIcs mean'] < .95) & (nsrdb['Clearsky GHI pvlib'] > 0)])
num_cloudy_good_ratio = len(nsrdb[(~nsrdb['sky_status']) & (nsrdb['GHI/GHIcs mean'] >= .95) & (nsrdb['Clearsky GHI pvlib'] > 0)])
num_cloudy_bad_ratio = len(nsrdb[(~nsrdb['sky_status']) & (nsrdb['GHI/GHIcs mean'] < .95) & (nsrdb['Clearsky GHI pvlib'] > 0)])
print('Clear periods, good ratio: {}'.format(num_clear_good_ratio))
print('Clear periods, bad ratio: {}'.format(num_clear_bad_ratio))
print()
print('Cloudy periods, good ratio: {}'.format(num_cloudy_good_ratio))
print('Cloudy periods, good ratio: {}'.format(num_cloudy_bad_ratio))
tmp = nsrdb[(nsrdb['sky_status']) & (nsrdb['GHI/GHIcs'] < .95)]
fig, ax = plt.subplots()
# bins = np.linspace(0, 1.2, 25)
ax.hist(tmp['abs(t-tnoon)'], label='clear', color='C1', alpha=.75)
# ax.hist(cloudy_pds['GHI/GHIcs'], bins, label='cloudy', color='C0', alpha=.75)
ax.set_xlabel('|t - t$_\mathrm{noon}$|')
ax.set_xlim((0, 450))
# _ = ax.legend()
_ = ax.set_ylabel('Frequency')
_ = ax.set_title('Clear with ratio < 0.95')
For NSRDB labeled clear periods:
For NSRDB labeled cloudy periods:
It would be wise to score classifiers based on their recall score, as the vast majority clear labels from NSRDB appear to be correct (based on the $\mathrm{GHI/GHI_{CS}}$ ratio). The no special method was used to choose the good/bad ratio cutoff of .9.
# filter off night times, will skew data
nsrdb_nonight = nsrdb[nsrdb['Clearsky GHI pvlib'] > 0]
clear_pds = nsrdb_nonight[nsrdb_nonight['sky_status']]
cloudy_pds = nsrdb_nonight[~nsrdb_nonight['sky_status']]
print(np.mean(clear_pds['GHI-GHIcs mean']), np.std(clear_pds['GHI-GHIcs mean']), np.median(clear_pds['GHI-GHIcs mean']))
print(np.mean(cloudy_pds['GHI-GHIcs mean']), np.std(cloudy_pds['GHI-GHIcs mean']), np.median(cloudy_pds['GHI-GHIcs mean']))
fig, ax = plt.subplots()
sns.boxplot(data=[cloudy_pds['GHI-GHIcs mean'], clear_pds['GHI-GHIcs mean']], ax=ax)
_ = ax.set_ylabel('GHI - GHI$_\mathrm{CS}$ mean')
_ = ax.set_xticklabels(['cloudy', 'clear'])
_ = ax.set_xlabel('NSRDB label')
pd.DataFrame(cloudy_pds['GHI-GHIcs mean'].describe())
clear_pds['GHI-GHIcs mean'].describe()
Clear periods have relatively tight distribution near a ratio of 1. Some obvious low outliers need to be fixed/removed. Cloudy periods show much more spread which is expected. The mean and median of cloudy periods is quite a bit lower than clear periods (mean and median of clear ~.95, cloudy mean and median is .66 and .74). Based on the whiskers there might be mislabeled cloudy points, though these points could also have ratios near 1 but be in noisy periods.
fig, ax = plt.subplots()
# sns.distplot(clear_pds['GHI/GHIcs'], label='clear', ax=ax, color='C1')
# sns.distplot(cloudy_pds['GHI/GHIcs'], label='cloudy', ax=ax, color='C0')
# bins = np.linspace(0, 1.2, 25)
ax.hist(clear_pds['GHI-GHIcs mean'], label='clear', color='C1', alpha=.75)
ax.hist(cloudy_pds['GHI-GHIcs mean'], label='cloudy', color='C0', alpha=.75)
ax.set_xlabel('GHI - GHI$_\mathrm{CS}$ mean')
_ = ax.legend()
_ = ax.set_ylabel('Frequency')
Another view of the same data. Clear periods have ratios very localized just under 1 while cloudy periods are spread out.
clear_list, cloudy_list, year_list = [], [], []
for year, grp in nsrdb_nonight.groupby([nsrdb_nonight.index.year]):
clear_pds = grp[grp['sky_status']]
cloudy_pds = grp[~grp['sky_status']]
clear_list.append(clear_pds['GHI-GHIcs mean'])
cloudy_list.append(cloudy_pds['GHI-GHIcs mean'])
year_list.append(year)
fig, ax = plt.subplots(ncols=1, nrows=2, sharex=True, sharey=True, figsize=(16, 8))
sns.boxplot(data=clear_list, ax=ax[0], color='C1')
_ = ax[0].set_title('Clear')
# _ = ax[0].set_ylim(0, 1.2)
sns.boxplot(data=cloudy_list, ax=ax[1], color='C0')
_ = ax[1].set_xticklabels(year_list, rotation=45)
_ = ax[1].set_title('Cloudy')
# _ = ax[1].set_ylim(0, 1.2)
ax[0].set_ylabel('GHI - GHI$_\mathrm{CS}$ mean')
ax[1].set_ylabel('GHI - GHI$_\mathrm{CS}$ mean')
_ = fig.tight_layout()
Visually it appears that each year behaves similarly for both cloudy and clear data.
clear_list, cloudy_list, year_list = [], [], []
for year, grp in nsrdb_nonight.groupby([nsrdb_nonight.index.year, nsrdb_nonight.index.month // 3]):
clear_pds = grp[grp['sky_status']]
cloudy_pds = grp[~grp['sky_status']]
clear_list.append(clear_pds['GHI-GHIcs mean'])
cloudy_list.append(cloudy_pds['GHI-GHIcs mean'])
year_list.append(year)
fig, ax = plt.subplots(ncols=1, nrows=2, sharex=True, figsize=(16, 8))
sns.boxplot(data=clear_list, ax=ax[0], color='C1')
_ = ax[0].set_title('Clear')
# _ = ax[0].set_ylim(0, 1.2)
sns.boxplot(data=cloudy_list, ax=ax[1], color='C0')
_ = ax[1].set_xticklabels(year_list, rotation=90)
_ = ax[1].set_title('Cloudy')
# _ = ax[1].set_ylim(0, 1.2)
ax[0].set_ylabel('GHI - GHI$_\mathrm{CS}$ mean')
ax[1].set_ylabel('GHI - GHI$_\mathrm{CS}$ mean')
_ = fig.tight_layout()
A finer view of the year-on-year behavior doesn't show any striking seasonal trends. During cloudy periods, we do notice that as time goes on, there are less 'outlier' points on the low end of the ratios.
clear_list, cloudy_list, tfn_list = [], [], []
for tfn, grp in nsrdb_nonight.groupby([nsrdb_nonight['abs(t-tnoon)']]):
clear_pds = grp[grp['sky_status']]
cloudy_pds = grp[~grp['sky_status']]
clear_list.append(clear_pds['GHI-GHIcs mean'])
cloudy_list.append(cloudy_pds['GHI-GHIcs mean'])
tfn_list.append(tfn)
fig, ax = plt.subplots(ncols=1, nrows=2, sharex=True, figsize=(16, 8))
sns.boxplot(data=clear_list, ax=ax[0], color='C1')
_ = ax[0].set_title('Clear')
# _ = ax[0].set_ylim(0, 1.2)
sns.boxplot(data=cloudy_list, ax=ax[1], color='C0')
_ = ax[1].set_xticklabels(tfn_list, rotation=45)
_ = ax[1].set_title('Cloudy')
# _ = ax[1].set_ylim(0, 1.2)
_ = ax[1].set_xlabel('Minutes from noon')
_ = ax[0].set_ylabel('GHI - GHI$_{\mathrm{CS}}$ mean')
_ = ax[1].set_ylabel('GHI - GHI$_{\mathrm{CS}}$ mean')
_ = fig.tight_layout()
As we move away from solar noon (defined as the point of maximum model irradiance), the clear labeled periods include more and more outliers with respect to the GHI/GHIcs ratio.
sample = nsrdb[nsrdb.index >= '01-01-2012']
sample = nsrdb[(nsrdb.index >= '03-01-2012') & (nsrdb.index < '03-15-2012')]
fig, ax = plt.subplots(figsize=(24, 8))
_ = ax.plot(sample.index, sample['GHI'], label='GHI')
_ = ax.plot(sample.index, sample['Clearsky GHI pvlib'], label='GHIcs', alpha=.5)
a = ax.scatter(sample[sample['sky_status']].index, sample[sample['sky_status']]['GHI'],
c=sample[sample['sky_status']]['GHI-GHIcs mean'], label=None, zorder=10, cmap='Reds', s=80)
_ = ax.legend(bbox_to_anchor=(1.25, .98))
_ = ax.set_title('NSRDB Clear periods')
_ = ax.set_xlabel('Date')
_ = ax.set_ylabel('GHI / Wm$^{-2}$')
cb = fig.colorbar(a)
cb.set_label('GHI - GHI$_{\mathrm{CS}}$')
For points labeled as clear by NSRDB, the early morning periods have a noticably lower ratio. This is visual confirmation of the previous plot.
sample = nsrdb[(nsrdb.index >= '03-01-2012') & (nsrdb.index < '03-15-2012')]
fig, ax = plt.subplots(figsize=(24, 8))
_ = ax.plot(sample.index, sample['GHI'], label='GHI')
_ = ax.plot(sample.index, sample['Clearsky GHI pvlib'], label='GHIcs', alpha=1)
a = ax.scatter(sample[~sample['sky_status'] & sample['Clearsky GHI pvlib'] > 0].index,
sample[~sample['sky_status'] & sample['Clearsky GHI pvlib'] > 0]['GHI'],
c=sample[~sample['sky_status'] & sample['Clearsky GHI pvlib'] > 0]['GHI-GHIcs mean'], zorder=10, cmap='Reds', s=80, label=None)
_ = ax.legend(bbox_to_anchor=(1.25, .98))
_ = ax.set_title('NSRDB cloudy periods')
_ = ax.set_xlabel('Date')
_ = ax.set_ylabel('GHI / Wm$^{-2}$')
cb = fig.colorbar(a)
cb.set_label('GHI - GHI$_{\mathrm{CS}}$')
num_clear_good_diff = len(nsrdb[(nsrdb['sky_status']) & (np.abs(nsrdb['GHI-GHIcs mean']) <= 50) & (nsrdb['Clearsky GHI pvlib'] > 0)])
num_clear_bad_diff = len(nsrdb[(nsrdb['sky_status']) & (np.abs(nsrdb['GHI-GHIcs mean']) > 50) & (nsrdb['Clearsky GHI pvlib'] > 0)])
num_cloudy_good_diff = len(nsrdb[(~nsrdb['sky_status']) & (np.abs(nsrdb['GHI-GHIcs mean']) <= 50) & (nsrdb['Clearsky GHI pvlib'] > 0)])
num_cloudy_bad_diff = len(nsrdb[(~nsrdb['sky_status']) & (np.abs(nsrdb['GHI-GHIcs mean']) > 50) & (nsrdb['Clearsky GHI pvlib'] > 0)])
print('Clear periods, good diff: {}'.format(num_clear_good_diff))
print('Clear periods, bad diff: {}'.format(num_clear_bad_diff))
print()
print('Cloudy periods, good diff: {}'.format(num_cloudy_good_diff))
print('Cloudy periods, good diff: {}'.format(num_cloudy_bad_diff))
There appear to be many clear points that are labeled as cloudy. There are many cases where there is a ratio near 1 during a 'noisy' period, which should not be labeled clear. We can't assume ratio is good enough. This seems to be a much bigger problem than clear points having a low ratio.
# nsrdb = pd.read_pickle('abq_nsrdb_1.pkl.gz')
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, '01-01-2015')
test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates('01-01-2015', None)
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, max_depth=10, random_state=42)
clf = train_obj.fit_model(clf)
pred = test_obj.predict(clf)
print(metrics.accuracy_score(test_obj.df['sky_status'], pred))
print(metrics.recall_score(test_obj.df['sky_status'], pred))
cm = metrics.confusion_matrix(test_obj.df['sky_status'], pred)
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'))
fig, ax = plt.subplots(figsize=(12, 8))
_ = ax.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
_ = ax.set_xticks(range(len(clf.feature_importances_)))
_ = ax.set_xticklabels(test_obj.features_, rotation=45)
_ = ax.set_ylabel('Importance')
_ = ax.set_xlabel('Feature')
_ = fig.tight_layout()
fig, ax = plt.subplots(figsize=(12, 8))
nsrdb_mask = test_obj.df['sky_status'].values
ax.plot(test_obj.df.index, test_obj.df['GHI'], label='GHI', alpha=.5)
ax.plot(test_obj.df.index, test_obj.df['Clearsky GHI pvlib'], label='GHIcs', alpha=.5)
ax.scatter(test_obj.df[pred & ~nsrdb_mask].index, test_obj.df[pred & ~nsrdb_mask]['GHI'], label='RF only', zorder=10, s=10)
ax.scatter(test_obj.df[nsrdb_mask & ~pred].index, test_obj.df[nsrdb_mask & ~pred]['GHI'], label='NSRDB only', zorder=10, s=10)
ax.scatter(test_obj.df[nsrdb_mask & pred].index, test_obj.df[nsrdb_mask & pred]['GHI'], label='Both', zorder=10, s=10)
_ = ax.legend(bbox_to_anchor=(1.25, 1))
Clean data by setting some cutoffs (by hand). For this study, GHI/GHIcs mean has to be >= .9 and the coefficient of variance must be less than .1. Also need to limit GHI-GHIcs (due to low irradiance periods) to
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
detect_obj.filter_labels()
train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, '01-01-2015')
test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates('01-01-2015', None)
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, max_depth=10, random_state=42)
clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50)
pred = test_obj.predict(clf)
test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
print(metrics.accuracy_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']]))
print(metrics.recall_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']]))
print(metrics.recall_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']]))
print(metrics.accuracy_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']]))
cm = metrics.confusion_matrix(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']])
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'))
fig, ax = plt.subplots(figsize=(12, 8))
_ = ax.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
_ = ax.set_xticks(range(len(clf.feature_importances_)))
_ = ax.set_xticklabels(test_obj.features_, rotation=45)
_ = ax.set_ylabel('Importance')
_ = ax.set_xlabel('Feature')
_ = fig.tight_layout()
fig, ax = plt.subplots(figsize=(24, 8))
nsrdb_mask = test_obj.df['sky_status'].values
ax.plot(test_obj.df.index, test_obj.df['GHI'], label='GHI', alpha=.5)
ax.plot(test_obj.df.index, test_obj.df['Clearsky GHI pvlib'], label='GHIcs', alpha=.5)
ax.scatter(test_obj.df[nsrdb_mask & ~pred].index, test_obj.df[nsrdb_mask & ~pred]['GHI'], label='NSRDB only', zorder=10, s=50)
ax.scatter(test_obj.df[pred & ~nsrdb_mask].index, test_obj.df[pred & ~nsrdb_mask]['GHI'], label='RF only', zorder=10, s=50)
ax.scatter(test_obj.df[nsrdb_mask & pred].index, test_obj.df[nsrdb_mask & pred]['GHI'], label='Both', zorder=10, s=50)
_ = ax.legend(bbox_to_anchor=(1.25, 1))
_ = ax.set_xlabel('Date')
_ = ax.set_ylabel('GHI / Wm$^{-2}$')
# fig, ax = plt.subplots(figsize=(12, 8))
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers')
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers')
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers')
# _ = ax.legend(bbox_to_anchor=(1.25, 1))
# _ = ax.set_xlabel('Date')
# _ = ax.set_ylabel('GHI / Wm$^{-2}$')
iplot([trace1, trace2, trace3, trace4, trace5])
print(len(test_obj.df[nsrdb_mask & ~pred]))
Thus far, we have trained on default and cleaned data. When scoring these methods, we have not cleaned the testing set. This needs to be done to provide a fair comparison between cleaned and default data sets. We will also score between default-trained/cleaned-testing and cleaned-trained/default-testing data sets.
dflt_detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
dflt_detect_obj.df.index = dflt_detect_obj.df.index.tz_convert('MST')
dflt_train_obj = cs_detection.ClearskyDetection(dflt_detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
dflt_train_obj.trim_dates(None, '01-01-2015')
dflt_test_obj = cs_detection.ClearskyDetection(dflt_detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
dflt_test_obj.trim_dates('01-01-2015', None)
dflt_model = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, max_depth=10, random_state=42)
dflt_model = dflt_train_obj.fit_model(dflt_model)
dflt_pred = dflt_test_obj.predict(dflt_model)
np.bincount(dflt_test_obj.df['mask'])
clean_detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
clean_detect_obj.df.index = clean_detect_obj.df.index.tz_convert('MST')
clean_train_obj = cs_detection.ClearskyDetection(clean_detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
clean_train_obj.trim_dates(None, '01-01-2015')
clean_test_obj = cs_detection.ClearskyDetection(clean_detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
clean_test_obj.trim_dates('01-01-2015', None)
clean_model = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, max_depth=10, random_state=42)
clean_model = clean_train_obj.fit_model(clean_model, ratio_mean_val=0.95, diff_mean_val=50)
clean_pred = clean_test_obj.predict(clean_model)
clean_test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
np.bincount(clean_test_obj.df['mask'])
true = dflt_test_obj.df['sky_status']
pred = dflt_pred
print('accuracy: {}'.format(metrics.accuracy_score(true, pred)))
print('precision: {}'.format(metrics.precision_score(true, pred)))
print('recall: {}'.format(metrics.recall_score(true, pred)))
print('f1: {}'.format(metrics.f1_score(true, pred)))
cm = metrics.confusion_matrix(true, pred)
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'), title='RF performance on default trained and default scored')
true = clean_test_obj.df[clean_test_obj.df['mask']]['sky_status']
pred = dflt_pred[clean_test_obj.df['mask']]
print('accuracy: {}'.format(metrics.accuracy_score(true, pred)))
print('precision: {}'.format(metrics.precision_score(true, pred)))
print('recall: {}'.format(metrics.recall_score(true, pred)))
print('f1: {}'.format(metrics.f1_score(true, pred)))
cm = metrics.confusion_matrix(true, pred)
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'), title='RF performance on default trained and cleaned scored')
true = dflt_test_obj.df['sky_status']
pred = clean_pred
print('accuracy: {}'.format(metrics.accuracy_score(true, pred)))
print('precision: {}'.format(metrics.precision_score(true, pred)))
print('recall: {}'.format(metrics.recall_score(true, pred)))
print('f1: {}'.format(metrics.f1_score(true, pred)))
cm = metrics.confusion_matrix(true, pred)
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'), title='RF performance on clean trained and default scored')
true = clean_test_obj.df[clean_test_obj.df['mask']]['sky_status']
pred = clean_pred[clean_test_obj.df['mask']]
print('accuracy: {}'.format(metrics.accuracy_score(true, pred)))
print('precision: {}'.format(metrics.precision_score(true, pred)))
print('recall: {}'.format(metrics.recall_score(true, pred)))
print('f1: {}'.format(metrics.f1_score(true, pred)))
cm = metrics.confusion_matrix(true, pred)
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'), title='RF performance on clean trained and clean scored')
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
train_obj = cs_detection.ClearskyDetection(clean_detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates('01-01-2010', '01-01-2015')
test_obj = cs_detection.ClearskyDetection(clean_detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates('01-01-2015', None)
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, max_depth=10, random_state=42)
scores = train_obj.cross_val_score(clf, scoring='f1')
print('{} +/- {}'.format(np.mean(scores), np.std(scores)))
clf = train_obj.fit_model(clf)
pred = test_obj.predict(clf)
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10})
iplot([trace1, trace2, trace3, trace4, trace5])
true = test_obj.df['sky_status']
cm = metrics.confusion_matrix(true, pred)
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'), title='RF performance on clean trained and clean scored')
scores = train_obj.cross_val_score(clf, scoring='f1', filter_kwargs={'ratio_mean_val': 0.90, 'diff_mean_val': 100}, filter_fit=True, filter_score=True)
print('{} +/- {}'.format(np.mean(scores), np.std(scores)))
clf = train_obj.fit_model(clf, **{'ratio_mean_val': 0.90, 'diff_mean_val': 100})
pred = test_obj.predict(clf)
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10})
iplot([trace1, trace2, trace3, trace4, trace5])
true = test_obj.df['sky_status'][test_obj.df['mask']]
cm = metrics.confusion_matrix(true, pred[test_obj.df['mask']])
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'), title='RF performance on clean trained and clean scored')
scores = train_obj.cross_val_score(clf, scoring='f1', filter_kwargs={'ratio_mean_val': 0.93, 'diff_mean_val': 70}, filter_fit=True, filter_score=True)
print('{} +/- {}'.format(np.mean(scores), np.std(scores)))
clf = train_obj.fit_model(clf, **{'ratio_mean_val': 0.93, 'diff_mean_val': 70})
pred = test_obj.predict(clf)
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10})
iplot([trace1, trace2, trace3, trace4, trace5])
true = test_obj.df['sky_status'][test_obj.df['mask']]
cm = metrics.confusion_matrix(true, pred[test_obj.df['mask']])
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'), title='RF performance on clean trained and clean scored')
scores = train_obj.cross_val_score(clf, scoring='f1', filter_kwargs={'ratio_mean_val': 0.95, 'diff_mean_val': 50}, filter_fit=True, filter_score=True)
print('{} +/- {}'.format(np.mean(scores), np.std(scores)))
clf = train_obj.fit_model(clf, **{'ratio_mean_val': 0.95, 'diff_mean_val': 50})
pred = test_obj.predict(clf)
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10})
iplot([trace1, trace2, trace3, trace4, trace5])
true = test_obj.df['sky_status'][test_obj.df['mask']]
cm = metrics.confusion_matrix(true, pred[test_obj.df['mask']])
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'), title='RF performance on clean trained and clean scored')
scores = train_obj.cross_val_score(clf, scoring='f1', filter_kwargs={'ratio_mean_val': 0.97, 'diff_mean_val': 30}, filter_fit=True, filter_score=True)
print('{} +/- {}'.format(np.mean(scores), np.std(scores)))
clf = train_obj.fit_model(clf, **{'ratio_mean_val': 0.97, 'diff_mean_val': 30})
pred = test_obj.predict(clf)
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10})
iplot([trace1, trace2, trace3, trace4, trace5])
true = test_obj.df['sky_status'][test_obj.df['mask']]
cm = metrics.confusion_matrix(true, pred[test_obj.df['mask']])
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'), title='RF performance on clean trained and clean scored')
scores = train_obj.cross_val_score(clf, scoring='f1', filter_kwargs={'ratio_mean_val': 0.99, 'diff_mean_val': 10}, filter_fit=True, filter_score=True)
print('{} +/- {}'.format(np.mean(scores), np.std(scores)))
clf = train_obj.fit_model(clf, **{'ratio_mean_val': 0.99, 'diff_mean_val': 10})
pred = test_obj.predict(clf)
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10})
iplot([trace1, trace2, trace3, trace4, trace5])
true = test_obj.df['sky_status'][test_obj.df['mask']]
cm = metrics.confusion_matrix(true, pred[test_obj.df['mask']])
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'), title='RF performance on clean trained and clean scored')
for ml in [10, 20, 30]:
for nest in [32]:
clf = ensemble.RandomForestClassifier(n_jobs=-1, min_samples_leaf=ml, n_estimators=nest)
scores = train_obj.cross_val_score(clf, scoring='f1', filter_kwargs={'ratio_mean_val': 0.99, 'diff_mean_val': 10}, filter_fit=True, filter_score=True)
print('n_estimators: {}, min_samples_leaf: {}'.format(nest, ml))
print(' {} +/- {}'.format(np.mean(scores), np.std(scores)))
clf = ensemble.RandomForestClassifier(n_jobs=-1, min_samples_leaf=20, n_estimators=32)
clf = train_obj.fit_model(clf, **{'ratio_mean_val': 0.99, 'diff_mean_val': 10})
pred = test_obj.predict(clf)
nsrdb_mask = test_obj.df['sky_status'].values
trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10})
iplot([trace1, trace2, trace3, trace4, trace5])